Library Imports
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F
from datetime import datetime
from decimal import Decimal
Template
spark = (
    SparkSession.builder
    .master("local")
    .appName("Section 2.5 - Casting Columns to Different Type")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
sc = spark.sparkContext
import os
data_path = "/data/pets.csv"
base_path = os.path.dirname(os.getcwd())
path = base_path + data_path
pets = spark.read.csv(path, header=True)
pets.toPandas()
|    | id | breed_id | nickname | birthday | age | color |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | King | 2014-11-22 12:30:31 | 5 | brown | 
| 1 | 2 | 3 | Argus | 2016-11-22 10:05:10 | 10 | None | 
| 2 | 3 | 1 | Chewie | 2016-11-22 10:05:10 | 15 | None | 
Casting Columns to Different Types
Sometimes your data will be read in with every column as unicode/string, in which case you will need to cast the columns to their correct types. Or you may simply want to change the type of a column as part of your transformation.
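For example, because we read pets.csv without inferSchema=True, every column comes back as a string. A minimal sketch to confirm this (the commented output is illustrative, assuming the pets DataFrame from above):
# Without `inferSchema=True`, spark.read.csv() reads every column as a string.
pets.printSchema()

# root
#  |-- id: string (nullable = true)
#  |-- breed_id: string (nullable = true)
#  |-- nickname: string (nullable = true)
#  |-- birthday: string (nullable = true)
#  |-- age: string (nullable = true)
#  |-- color: string (nullable = true)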
Option 1 - cast()
(
    pets
    .select('birthday')
    .withColumn('birthday_date', F.col('birthday').cast('date'))
    .withColumn('birthday_date_2', F.col('birthday').cast(T.DateType()))
    .toPandas()
)
|    | birthday | birthday_date | birthday_date_2 |
|---|---|---|---|
| 0 | 2014-11-22 12:30:31 | 2014-11-22 | 2014-11-22 | 
| 1 | 2016-11-22 10:05:10 | 2016-11-22 | 2016-11-22 | 
| 2 | 2016-11-22 10:05:10 | 2016-11-22 | 2016-11-22 | 
What Happened?
There are 2 ways that you can cast a column.
- Use a string (cast('date')).
- Use the spark types (cast(T.DateType())).
I tend to use a string as it's shorter, requires one less import, and most editors will provide syntax highlighting for the string.
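As a further sketch (assuming the same pets DataFrame), both syntaxes work the same way for other types, e.g. casting the age column to an integer:
(
    pets
    .select('age')
    # string form of the cast
    .withColumn('age_int', F.col('age').cast('int'))
    # spark type form of the cast
    .withColumn('age_int_2', F.col('age').cast(T.IntegerType()))
    .toPandas()
)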
Summary
- We learnt about two ways of casting a column.
- The first way is a bit cleaner, IMO.